This IPython notebook covers options for preprocessing scanned medical documents for use with OCR software such as Tesseract.
import cv2
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import os
import time
%matplotlib inline
print("OpenCV Version : %s " % cv2.__version__)
def rotate(image, angle, scale=1.0):
"""Rotate image by angle and scale.
"""
h, w = image.shape[:2]
center = (w//2, h//2)
M = cv2.getRotationMatrix2D(center, angle, scale)
    # fill the border with 255 (white) during rotation; assumes a grayscale image
    rotated = cv2.warpAffine(image, M, (w, h), borderValue=255)
return rotated
def pad(image, h_margin=100, w_margin=100):
"""Pad 2D image by `margin` pixels on four sides.
"""
assert len(image.shape) == 2, 'Image is not 2D!'
h, w = image.shape
padded = np.ones((h + 2 * h_margin, w + 2 * w_margin), dtype='uint8') * 255
padded[h_margin : (h_margin + h), w_margin : (w_margin + w)] = image
return padded
def order_points(pts):
"""Reorder an array of 4 coordinates.
The reordered list is in the order of top-left, top-right,
bottom-right, and bottom-left.
"""
rect = np.zeros((4, 2), dtype = "float32")
# the top-left point will have the smallest sum
# the bottom-right point will have the largest sum
s = pts.sum(axis = 1)
rect[0] = pts[np.argmin(s)]
rect[2] = pts[np.argmax(s)]
# the top-right point will have the smallest difference
# the bottom-left will have the largest difference
diff = np.diff(pts, axis = 1)
rect[1] = pts[np.argmin(diff)]
rect[3] = pts[np.argmax(diff)]
return rect
def four_point_transform(image, pts):
"""Perspective transformation a region of interest in image.
"""
# reorder the points first
rect = order_points(pts)
(tl, tr, br, bl) = rect
# compute the width of the new image
widthA = np.sqrt(((br[0] - bl[0]) ** 2) + ((br[1] - bl[1]) ** 2))
widthB = np.sqrt(((tr[0] - tl[0]) ** 2) + ((tr[1] - tl[1]) ** 2))
maxWidth = max(int(widthA), int(widthB))
# compute the height of the new image
heightA = np.sqrt(((tr[0] - br[0]) ** 2) + ((tr[1] - br[1]) ** 2))
heightB = np.sqrt(((tl[0] - bl[0]) ** 2) + ((tl[1] - bl[1]) ** 2))
maxHeight = max(int(heightA), int(heightB))
# construct destination canvas
dst = np.array([
[0, 0],
[maxWidth - 1, 0],
[maxWidth - 1, maxHeight - 1],
[0, maxHeight - 1]], dtype = "float32")
# compute the perspective transform matrix and then apply it
M = cv2.getPerspectiveTransform(rect, dst)
warped = cv2.warpPerspective(image, M, (maxWidth, maxHeight))
# return the warped image
return warped
def box_height(box):
"""Find the height of a bounding box.
Input `box` contains the coordinates of four corner points of the box.
"""
points = order_points(box)
height = ((points[0][0] - points[-1][0])**2 + (points[0][1] - points[-1][1])**2)**0.5
return height
def box_aspect_ratio(box):
"""Find the aspect ratio of a bounding box.
Input `box` contains the coordinates of four corner points of the box.
"""
points = order_points(box)
height = ((points[0][0] - points[-1][0])**2 + (points[0][1] - points[-1][1])**2)**0.5
width = ((points[0][0] - points[1][0])**2 + (points[0][1] - points[1][1])**2)**0.5
aspect_ratio = height * 1. / width
return aspect_ratio
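A quick sanity check of `order_points` on a made-up, shuffled set of corner points (the coordinates are for illustration only):
# shuffled corner points of a quadrilateral (illustrative values)
pts = np.array([[90, 80], [10, 5], [95, 10], [5, 75]], dtype='float32')
print(order_points(pts))
# expected order: top-left, top-right, bottom-right, bottom-left, i.e.
# [10, 5], [95, 10], [90, 80], [5, 75]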
filenames = [r'./testImages/1000992520057_3007_1.jpg',
r'./testImages/1000995056687_3007_1.jpg',
r'./testImages/1000967244029_3007_1.jpg',
r'./testImages/1000968571699_3007_2.jpg',
]
image = cv2.imread(filenames[0])
# convert to gray scale and visualize
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
matplotlib.rcParams['figure.figsize'] = (5.0, 10.0)
plt.imshow(gray, cmap='gray')
# # Note: unfortunately histogram equalization does not work well with document images!
# equ = cv2.equalizeHist(gray)
# plt.imshow(np.hstack((gray, equ)), cmap='gray')
# Combination of global thresholding...
(T, thresh1) = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
# plt.imshow(thresh1, cmap='gray')
# And adaptive thresholding!
thresh2 = cv2.adaptiveThreshold(gray, 255,
cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 15, 9)
# plt.imshow(thresh2, cmap='gray')
# bitwise OR helps to clean up artifacts at the boundaries
thresh3 = cv2.bitwise_or(thresh1, thresh2)
matplotlib.rcParams['figure.figsize'] = (60.0, 30.0)
plt.imshow(np.hstack((thresh1, thresh2, thresh3)), cmap='gray')
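As a possible alternative to the hand-tuned global threshold of 200, Otsu's method picks the global threshold automatically from the image histogram. This is a minimal sketch; whether it beats the fixed value on low-contrast scans would need testing:
# Otsu's method selects the global threshold from the image histogram,
# avoiding the hand-tuned value 200 used above
(T_otsu, thresh_otsu) = cv2.threshold(gray, 0, 255,
                                      cv2.THRESH_BINARY + cv2.THRESH_OTSU)
print('Otsu picked threshold {}'.format(T_otsu))
# could then be combined with the adaptive result exactly as above:
# thresh3 = cv2.bitwise_or(thresh_otsu, thresh2)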
The first round performs box detection directly on the preprocessed binary image. This normally gives fairly good results; however, if the scanning quality is poor, we use the first round to clean up the borders and then run a second round.
thresh = thresh3.copy()
# inversion to make characters non-zero
image = 255 - thresh
# morph
struct_elem = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 1))
image = cv2.dilate(image.copy(), struct_elem, iterations=2)
# find contours (OpenCV 3.x returns three values; OpenCV 4.x returns only contours and hierarchy)
im2, cnts, hierarchy = cv2.findContours(image.copy().astype(np.uint8),
                                        cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
# drop small contours, which are mostly noise
cnts = [cnt for cnt in cnts if cv2.contourArea(cnt) > 1400]
# compute and draw bounding box
canvas = thresh.copy()
c = sorted(cnts, key=cv2.contourArea, reverse=True)
boxes = []
for i in range(len(c)):
rect = cv2.minAreaRect(c[i])
box = np.int0(cv2.boxPoints(rect))
boxes.append(box)
cv2.drawContours(canvas, [box], -1, 0, 3)
matplotlib.rcParams['figure.figsize'] = (20.0, 20.0)
plt.imshow(canvas, cmap='gray')
Now we inspect the statistics of the box locations, which gives us useful information about the font size, left and right borders, rotation angle (if the text is tilted), and so on.
The second round finds the most popular left and right boundaries of the bounding boxes and then crops the original image accordingly.
heights = []
lefts = []
rights = []
tops = []
bottoms = []
angles = []
for box in boxes:
points = order_points(box)
height = ((points[0][0] - points[-1][0])**2 + (points[0][1] - points[-1][1])**2)**0.5
heights.append(height)
    # tilt angle of the top edge in degrees (arctan2 also avoids division by zero)
    angle = np.degrees(np.arctan2(points[1][1] - points[0][1], points[1][0] - points[0][0]))
angles.append(angle)
lefts.append(points[0][0])
rights.append(points[2][0])
tops.append(points[0][1])
bottoms.append(points[2][1])
# the median height of the bounding boxes is the fontsize in pixels
fontsize = int(np.median(heights))
rotate_angle = np.median(angles)
print('Font size is {} pixels.'.format(fontsize))
print('Rotate {:.2f} degrees counterclockwise to correct text tilt.'.format(rotate_angle))
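# Sanity check (a quick sketch): the histogram of box heights should have
# its main mode near the estimated font size.
plt.hist(heights, bins=50)
plt.axvline(fontsize, color='r')
plt.show()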
# the most popular left boundary: the sorted coordinate with the most
# neighbors within 10 pixels among the next 10 entries
lefts = np.sort(lefts)
n_neighbors = [np.sum(np.abs(lefts[idx:idx+10] - lefts[idx]) < 10) for idx in range(len(lefts))]
idx = np.argmax(n_neighbors)
left_border = int(lefts[idx])
# the most popular right boundary, taken at the far edge of its cluster
rights = np.sort(rights)
n_neighbors = [np.sum(np.abs(rights[idx:idx+10] - rights[idx]) < 10) for idx in range(len(rights))]
idx = np.argmax(n_neighbors)
if (idx + 10) < len(rights):
    right_border = int(rights[idx + 10])
else:
    right_border = canvas.shape[1] - 1
top_border = max(min(tops), 0)
bottom_border = max(bottoms)
print('Left {}, Right {}, Top {}, Bottom {}'.format(left_border, right_border, top_border, bottom_border))
cropped = thresh3[:, left_border:right_border]
# pad documents for visualization
cropped = pad(cropped)
plt.imshow(cropped, cmap='gray')
We can draw the bounding boxes with OpenCV and feed each extracted box to Tesseract.
# rotate to correct the text tilt estimated above
image = rotate(cropped, rotate_angle)
clean_canvas = image.copy()
canvas = clean_canvas.copy()
# inversion to make characters non-zero
image = 255 - image
# morph
struct_elem = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 1))
image = cv2.dilate(image.copy(), struct_elem, iterations=2)
# find contours
im2, cnts, hierarchy = cv2.findContours(image.copy().astype(np.uint8),
cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = [cnt for cnt in cnts if cv2.contourArea(cnt) > 1400]
# compute and draw bounding box
c = sorted(cnts, key=cv2.contourArea, reverse=True)
boxes = []
for i in range(len(c)):
rect = cv2.minAreaRect(c[i])
box = np.int0(cv2.boxPoints(rect))
    if box_height(box) > fontsize * 0.5 and box_aspect_ratio(box) <= 1.1:
        # keep boxes at least half a line tall and wider than they are tall
        boxes.append(box)
cv2.drawContours(canvas, [box], -1, 0, 3)
matplotlib.rcParams['figure.figsize'] = (20.0, 20.0)
plt.imshow(canvas, cmap='gray')
The coordinates of the boxes are listed in the array `boxes`. To extract a particular box, simply apply the following perspective transformation.
roi = four_point_transform(canvas.copy(), boxes[10])
plt.imshow(roi, cmap='gray')
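To actually run OCR on an extracted box, something like the sketch below should work, assuming the `pytesseract` wrapper and a Tesseract binary are installed. Note that it extracts from `clean_canvas` rather than `canvas`, so the drawn box outlines do not end up in the OCR input.
# a minimal OCR sketch, assuming pytesseract is installed
import pytesseract

# use the clean image so the drawn outlines are not fed to the OCR engine
roi = four_point_transform(clean_canvas.copy(), boxes[10])
# --psm 7 tells Tesseract to treat the input as a single text line
print(pytesseract.image_to_string(roi, config='--psm 7'))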
The results look pretty good at this stage. The final quality depends on whether Tesseract is more susceptible to false positives (returning a box containing only artifacts and no characters) or false negatives (failing to detect and box certain characters). Judging this would require knowledge of the entire pipeline.
One particular problem we have observed so far is that official stamps/seals sometimes overlap with the text area, in which case the algorithm above returns a box containing multiple lines of characters, as in images 0 and 3. This may or may not be a problem depending on the OCR algorithm. If we would like to return boxes with at most one line of characters, we can proceed with the following delineation.
canvas = clean_canvas.copy()
canvas1 = clean_canvas.copy()
oversized_boxes = []
old_boxes = []
for box in boxes:
    # keep boxes no taller than twice the font size and erase their
    # contents from the canvas, leaving only oversized boxes to delineate
if box_height(box) <= 2 * fontsize:
old_boxes.append(box)
cv2.fillPoly(canvas1, [box], 255)
cv2.fillPoly(canvas, [box], 255)
else:
oversized_boxes.append(box)
for box in oversized_boxes:
cv2.drawContours(canvas1, [box], -1, 0, 3)
plt.imshow(canvas1, cmap='gray')
Let's take a look at a specific example.
for box in oversized_boxes[:1]:
roi = four_point_transform(canvas.copy(), box)
plt.imshow(roi, cmap='gray')
delineate_flag = 1
image = 255 - canvas
# morph
struct_elem = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 1))
image = cv2.dilate(image.copy(), struct_elem, iterations=2)
delineated = image.copy()
if delineate_flag:
    # horizontal projection profile: row sums dip in the gaps between text lines
    profile = np.sum(image, axis=1)
    profile = (profile * 255.0 / np.max(profile)).astype(np.uint8)
    # stack the 1D profile into a two-row image so adaptiveThreshold can run on it
    profile_img = np.array([profile, profile])
    # local minima of the profile become 'on' windows after inverse adaptive thresholding
    minima_window = cv2.adaptiveThreshold(profile_img, 255,
        cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV, 51, 10)
    minima_window = minima_window[1, :]
    # rising (x1) and falling (x2) edges of the minima windows
    x1 = np.where(np.diff(minima_window.astype(float)) > 0)[0]
    x2 = np.where(np.diff(minima_window.astype(float)) < 0)[0]
    # drop unpaired edges at the borders so that x1[i] < x2[i] for every pair
    if x1[0] > x2[0]:
        x2 = x2[1:]
    if x1[-1] > x2[-1]:
        x1 = x1[:-1]
    assert len(x1) == len(x2)
minima = []
for i in range(len(x1)):
if np.min(profile[x1[i]:x2[i]]) > 0:
minima.append(np.argmin(profile[x1[i]:x2[i]]) + x1[i])
    for pt in minima:
        # cut a 10-pixel-tall horizontal gap through each minimum
        delineated[pt-5:pt+5, :] = 0
plt.imshow(np.hstack((image, delineated)), cmap='gray')
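The edge pairing above is easiest to verify on a toy window (values made up for illustration): every rising edge in `x1` pairs with the next falling edge in `x2`, and an 'on' region touching either border produces the unpaired edge that the border clean-up removes.
# toy minima window with two 'on' regions
w = np.array([0, 1, 1, 0, 0, 1, 1, 1, 0], dtype=float)
x1 = np.where(np.diff(w) > 0)[0]  # rising edges:  [0, 4]
x2 = np.where(np.diff(w) < 0)[0]  # falling edges: [2, 7]
# each pair (x1[i], x2[i]) brackets one 'on' region
print(list(zip(x1, x2)))          # [(0, 2), (4, 7)]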
canvas = clean_canvas.copy()
# find contours
im2, cnts, hierarchy = cv2.findContours(delineated.copy().astype(np.uint8),
cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = [cnt for cnt in cnts if cv2.contourArea(cnt) > 1400]
# compute and draw bounding box
c = sorted(cnts, key=cv2.contourArea, reverse=True)
new_boxes = []
for i in range(len(c)):
rect = cv2.minAreaRect(c[i])
box = np.int0(cv2.boxPoints(rect))
if box_height(box) > fontsize * 0.5:
new_boxes.append(box)
# combining the old and new boxes
boxes = old_boxes + new_boxes
for box in boxes:
cv2.drawContours(canvas, [box], -1, 0, 3)
matplotlib.rcParams['figure.figsize'] = (20.0, 20.0)
plt.imshow(canvas, cmap='gray')
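Before handing the boxes to OCR, it may also help to sort them into reading order. Below is a simple sketch; the helper `reading_order` is hypothetical, and it assumes text lines do not overlap vertically:
def reading_order(boxes, line_tol=None):
    """Sort boxes top-to-bottom, then left-to-right (hypothetical helper)."""
    tol = line_tol if line_tol is not None else fontsize
    def key(box):
        tl = order_points(box)[0]
        # quantize the vertical coordinate so boxes on the same line
        # sort by their horizontal position
        return (int(tl[1] // tol), tl[0])
    return sorted(boxes, key=key)

boxes = reading_order(boxes)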
Some more ideas for extracting text boxes: